{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "dir_path = './Amazon_Review/Books/'\n",
    "rating_file = 'ratings_Books.csv'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>item_id</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2IIIDRK3PRRZY</td>\n",
       "      <td>0000000116</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1395619200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>AYEDW3BFK53XK</td>\n",
       "      <td>0000013714</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1325462400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>A2GKR2Q7MD8DG4</td>\n",
       "      <td>0000029831</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1393286400</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           user_id     item_id  rating   timestamp\n",
       "1   A2IIIDRK3PRRZY  0000000116     1.0  1395619200\n",
       "10   AYEDW3BFK53XK  0000013714     5.0  1325462400\n",
       "20  A2GKR2Q7MD8DG4  0000029831     5.0  1393286400"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def read_user_rating_records():\n",
    "    col_names = ['user_id', 'item_id', 'rating', 'timestamp']\n",
    "    data_records = pd.read_csv(dir_path + rating_file, sep=',', names=col_names, engine='python')\n",
    "    return data_records\n",
    "\n",
    "data_records = read_user_rating_records()\n",
    "data_records.head()\n",
    "data_records.iloc[[1, 10, 20]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8026324 2330066\n"
     ]
    }
   ],
   "source": [
    "print(len(data_records['user_id'].value_counts()), len(data_records['item_id'].value_counts()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7118528 2138299\n"
     ]
    }
   ],
   "source": [
    "data_records.loc[data_records.rating < 4, 'rating'] = 0\n",
    "data_records.loc[data_records.rating >= 4, 'rating'] = 1\n",
    "data_records = data_records[data_records.rating > 0]\n",
    "print(len(data_records['user_id'].unique()), len(data_records['item_id'].unique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "users with < 20 interactoins are removed\n",
      "items with < 20 interactoins are removed\n",
      "num of users:76696, num of items:41265\n"
     ]
    }
   ],
   "source": [
    "from copy import deepcopy\n",
    "def remove_infrequent_items(data, min_counts=5):\n",
    "    df = deepcopy(data)\n",
    "    counts = df['item_id'].value_counts()\n",
    "    df = df[df[\"item_id\"].isin(counts[counts >= min_counts].index)]\n",
    "\n",
    "    print(\"items with < {} interactoins are removed\".format(min_counts))\n",
    "    # print(df.describe())\n",
    "    return df\n",
    "\n",
    "def remove_infrequent_users(data, min_counts=10):\n",
    "    df = deepcopy(data)\n",
    "    counts = df['user_id'].value_counts()\n",
    "    df = df[df[\"user_id\"].isin(counts[counts >= min_counts].index)]\n",
    "\n",
    "    print(\"users with < {} interactoins are removed\".format(min_counts))\n",
    "    # print(df.describe())\n",
    "    return df\n",
    "\n",
    "filtered_data = remove_infrequent_users(data_records, 20)\n",
    "filtered_data = remove_infrequent_items(filtered_data, 20)\n",
    "print('num of users:{}, num of items:{}'.format(len(filtered_data['user_id'].unique()), len(filtered_data['item_id'].unique())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "item_id\n",
      "0373876998    20\n",
      "1499295952    20\n",
      "B00KF073C8    20\n",
      "1499293224    20\n",
      "0778315126    20\n",
      "B00G0KFZ7C    20\n",
      "B00KEZV34S    20\n",
      "B00G00GUVM    20\n",
      "1499198469    20\n",
      "B00C1NCUL6    20\n",
      "dtype: int64\n",
      "user_id\n",
      "A18Y4FI13QPQ8V    1\n",
      "A384ODJB85EGS5    1\n",
      "A1GORK6WGLQQF2    1\n",
      "A1GOXX94PDPR59    1\n",
      "A1VFG5SVYMBH7K    1\n",
      "A1VF5LGIODDNG7    1\n",
      "A385OXH06QRH3W    1\n",
      "A5Q65JECU7D1C     1\n",
      "A5PJHEFSM7PZ3     1\n",
      "A388N4UTHDZUI4    1\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(filtered_data.groupby('item_id').size().sort_values(ascending=True)[:10])\n",
    "print(filtered_data.groupby('user_id').size().sort_values(ascending=True)[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['000100039X' '0002007770' '0002051850' '000215725X' '0002219417'\n",
      " '000222383X' '0002226618' '000224053X' '0002242052' '0002247399']\n"
     ]
    }
   ],
   "source": [
    "# read item's reviews\n",
    "item_list = filtered_data['item_id'].unique()\n",
    "item_set = set(item_list)\n",
    "\n",
    "print(item_list[:10])\n",
    "\n",
    "review_file = 'reviews_Books_5.json.gz'\n",
    "\n",
    "import json\n",
    "import gzip\n",
    "\n",
    "def parse(path):\n",
    "    g = gzip.open(path, 'r')\n",
    "    for l in g:\n",
    "        yield json.loads(l)\n",
    "        # yield json.dumps(eval(l))\n",
    "\n",
    "review_dict = dict()  # [review_id] = review_text\n",
    "review_helpful = dict()\n",
    "for l in parse(dir_path + review_file):\n",
    "    if l['asin'] in item_set:\n",
    "        if l['asin'] in review_dict:\n",
    "            if l['helpful'][0] / float(l['helpful'][1] + 0.01) > review_helpful[l['asin']] and len(l['reviewText']) > 10:\n",
    "                review_dict[l['asin']] = l['reviewText']\n",
    "                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)\n",
    "        else:\n",
    "            if len(l['reviewText']) > 10:\n",
    "                review_dict[l['asin']] = l['reviewText']\n",
    "                review_helpful[l['asin']] = l['helpful'][0] / float(l['helpful'][1] + 0.01)\n",
    "\n",
    "# print review_dict['1300966947']\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n",
      "41265\n"
     ]
    }
   ],
   "source": [
    "# delete items without reviews\n",
    "item_without_review = []\n",
    "for item_id in item_list:\n",
    "    if item_id not in review_dict:\n",
    "        item_without_review.append(item_id)\n",
    "\n",
    "print(item_without_review)\n",
    "\n",
    "for item_id in item_without_review:\n",
    "    filtered_data = filtered_data[filtered_data['item_id'] != item_id]\n",
    "\n",
    "item_list = filtered_data['item_id'].unique()\n",
    "print(len(item_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "for item_id, review in review_dict.items():\n",
    "    if len(review) < 5:\n",
    "        print(item_id)\n",
    "# print review_dict['B002IUAUI2']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "succressfully created sequencial data! head: user_id\n",
      "A002359833QJM7OQHCXWY    [B00BUKRALG, B00BWY3UKU, B004OEKH7Y, 076420477...\n",
      "A00463782V7TKAP9EMNL     [B00ES4C28C, 1941450008, B004XJ6922, 148481477...\n",
      "A0047322388NOTO4N8SKD     [0884191338, 0849947200, 0333516516, 1589267109]\n",
      "A00579222Q4YKY0J53RLA    [193415766X, 0345492641, 1451608160, 193639924...\n",
      "A006458827ALF2J23JJTO    [1489539042, 1482616319, B00DBE8QDU, B00DUFCJ1...\n",
      "Name: item_id, dtype: object\n",
      "user_id\n",
      "A002359833QJM7OQHCXWY    [B00BUKRALG, B00BWY3UKU, B004OEKH7Y, 076420477...\n",
      "A00463782V7TKAP9EMNL     [B00ES4C28C, 1941450008, B004XJ6922, 148481477...\n",
      "A00579222Q4YKY0J53RLA    [193415766X, 0345492641, 1451608160, 193639924...\n",
      "A006458827ALF2J23JJTO    [1489539042, 1482616319, B00DBE8QDU, B00DUFCJ1...\n",
      "A0092581WFYQNV4KMUZ3     [0425263916, 0615744257, 0060734019, 032147404...\n",
      "A0099735VDZ3HDCAAYKL     [0451228219, 0451229444, B008XOWVVG, B006BFX4U...\n",
      "A010971113OD625HDB6X8    [0606262520, 0985023058, 0373210515, 148270683...\n",
      "A010997525FU27TAPMJCG    [0451165209, 1442346272, 0470101474, 039331929...\n",
      "A01628721NLXK7ENDWDC9    [B00558WOZG, 1425746845, B004WLOGYE, B004Z1N2G...\n",
      "A01631062UX24GI4LJKF     [B00EOUWEW4, B00D6IAJHM, B00FXSDNX0, B00G002LH...\n",
      "Name: item_id, dtype: object\n",
      "52406\n",
      "<class 'pandas.core.series.Series'>\n"
     ]
    }
   ],
   "source": [
    "# convert records to sequential data per user\n",
    "def convert_data(data):\n",
    "    # for each user, sort by timestamps\n",
    "    df = deepcopy(data)\n",
    "    df_ordered = df.sort_values(['timestamp'], ascending=True)\n",
    "    data = df_ordered.groupby('user_id')['item_id'].apply(list)\n",
    "    #print(data)\n",
    "    #time_l = df_ordered.groupby('user')['checkin_time'].apply(list)\n",
    "    #print(time_l)\n",
    "    print(\"succressfully created sequencial data! head:\", data.head(5))\n",
    "    unique_data = df_ordered.groupby('user_id')['item_id'].nunique()\n",
    "    data = data[unique_data[unique_data >= 10].index]\n",
    "    print(data[:10])\n",
    "    print(len(data))\n",
    "    return data\n",
    "\n",
    "seq_data = convert_data(filtered_data)\n",
    "print(type(seq_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "52406 41264\n"
     ]
    }
   ],
   "source": [
    "user_item_dict = seq_data.to_dict()\n",
    "user_mapping = []\n",
    "item_set = set()\n",
    "for user_id, item_list in seq_data.iteritems():\n",
    "    user_mapping.append(user_id)\n",
    "    for item_id in item_list:\n",
    "        item_set.add(item_id)\n",
    "item_mapping = list(item_set)\n",
    "\n",
    "print(len(user_mapping), len(item_mapping))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[2943, 9318, 6295, 3211, 30412, 37719, 5338, 39088, 3321, 22536, 6107], [4010, 9926, 17692, 32858, 38332, 33292, 29533, 19637, 24784, 6624, 35398, 5187, 6639, 31289, 5330, 10625, 21043, 10026, 34328, 38276], [13874, 32449, 195, 38186, 18105, 25384, 31149, 25300, 34987, 33252, 4178, 13725, 22148], [33172, 36596, 6106, 7487, 40512, 3205, 24847, 23494, 28024, 14166, 37537, 1002, 9174, 28901, 33507, 24688, 6719, 25446, 38409, 9335, 28950, 34620, 34999, 3066, 40394, 7835, 1697, 6750, 37529, 19262, 10556, 28446, 31307, 6631, 12231, 28245, 40699, 8256], [4217, 24644, 13110, 32465, 27742, 8263, 34335, 12933, 29505, 21141, 20076, 4066, 25569, 32584, 13665, 13271, 25060, 8452, 7648]]\n"
     ]
    }
   ],
   "source": [
    "def generate_inverse_mapping(data_list):\n",
    "    inverse_mapping = dict()\n",
    "    for inner_id, true_id in enumerate(data_list):\n",
    "        inverse_mapping[true_id] = inner_id\n",
    "    return inverse_mapping\n",
    "\n",
    "def convert_to_inner_index(user_records, user_mapping, item_mapping):\n",
    "    inner_user_records = []\n",
    "    user_inverse_mapping = generate_inverse_mapping(user_mapping)\n",
    "    item_inverse_mapping = generate_inverse_mapping(item_mapping)\n",
    "\n",
    "    for user_id in range(len(user_mapping)):\n",
    "        real_user_id = user_mapping[user_id]\n",
    "        item_list = list(user_records[real_user_id])\n",
    "        for index, real_item_id in enumerate(item_list):\n",
    "            item_list[index] = item_inverse_mapping[real_item_id]\n",
    "        inner_user_records.append(item_list)\n",
    "\n",
    "    return inner_user_records, user_inverse_mapping, item_inverse_mapping\n",
    "\n",
    "inner_data_records, user_inverse_mapping, item_inverse_mapping = convert_to_inner_index(user_item_dict, user_mapping, item_mapping)\n",
    "print(inner_data_records[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "def save_obj(obj, name ):\n",
    "    with open(name + '.pkl', 'wb') as f:\n",
    "        pickle.dump(obj, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_obj(inner_data_records, 'Books_item_sequences')\n",
    "save_obj(user_mapping, 'Books_user_mapping')\n",
    "save_obj(item_mapping, 'Books_item_mapping')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import csr_matrix\n",
    "\n",
    "def generate_rating_matrix(train_set, num_users, num_items):\n",
    "    # three lists are used to construct sparse matrix\n",
    "    row = []\n",
    "    col = []\n",
    "    data = []\n",
    "    for user_id, article_list in enumerate(train_set):\n",
    "        for article in article_list:\n",
    "            row.append(user_id)\n",
    "            col.append(article)\n",
    "            data.append(1)\n",
    "\n",
    "    row = np.array(row)\n",
    "    col = np.array(col)\n",
    "    data = np.array(data)\n",
    "    rating_matrix = csr_matrix((data, (row, col)), shape=(num_users, num_items))\n",
    "\n",
    "    return rating_matrix\n",
    "\n",
    "rating_matrix = generate_rating_matrix(inner_data_records, len(user_mapping), len(item_mapping))\n",
    "rating_matrix = rating_matrix.transpose()\n",
    "\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "relation_matrix = cosine_similarity(rating_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "np.fill_diagonal(relation_matrix, 0)\n",
    "max_count = 0\n",
    "for i in range(len(item_mapping)):\n",
    "    max_count = max(np.count_nonzero((relation_matrix[i] >= 0.2) == True), max_count)\n",
    "    \n",
    "print max_count\n",
    "\n",
    "count = 0\n",
    "for i in range(len(item_mapping)):\n",
    "    if np.count_nonzero((relation_matrix[i] >= 0.2) == True) > 0:\n",
    "        count += 1\n",
    "\n",
    "print count\n",
    "print np.max(relation_matrix)\n",
    "print relation_matrix[0]\n",
    "print relation_matrix[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "relation_matrix[relation_matrix < 0.2] = 0\n",
    "relation_matrix[relation_matrix > 0] = 1\n",
    "relation_matrix = csr_matrix(relation_matrix)\n",
    "print len(user_mapping), len(item_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# process review content\n",
    "import re\n",
    "import nltk\n",
    "\n",
    "# generate the whole document\n",
    "all_review = []\n",
    "for item_id in item_mapping:\n",
    "    all_review.append([review_dict[item_id]])\n",
    "\n",
    "# use nltk to remove stopwords, and stemming each word\n",
    "from nltk.corpus import stopwords\n",
    "stopwords_set = set(stopwords.words('english'))\n",
    "porter_stemmer = nltk.PorterStemmer()\n",
    "\n",
    "review_str = []\n",
    "for i, movie in enumerate(all_review):\n",
    "    # Use regular expressions to do a find-and-replace\n",
    "    letters_only = re.sub(\"[^a-zA-Z]\",  # The pattern to search for\n",
    "                          \" \",  # The pattern to replace it with\n",
    "                          movie[0])  # The text to search\n",
    "    # print letters_only\n",
    "\n",
    "    letters_only = letters_only.lower()\n",
    "    tokens = nltk.word_tokenize(letters_only)\n",
    "\n",
    "    tokens = [w for w in tokens if w.lower() not in stopwords_set]\n",
    "    # print tokens\n",
    "\n",
    "    porter = [porter_stemmer.stem(t) for t in tokens]\n",
    "    # print porter\n",
    "    all_review[i] = porter\n",
    "    review_str.append(' '.join(porter))\n",
    "\n",
    "print review_str[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert to bag-of-words\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "vectorizer = CountVectorizer(analyzer=\"word\", tokenizer=None, preprocessor=None, stop_words=None, min_df=3)\n",
    "word_counts = vectorizer.fit_transform(review_str)\n",
    "vocab = vectorizer.get_feature_names()\n",
    "\n",
    "print len(vocab)\n",
    "print word_counts.data.max()\n",
    "print word_counts.data.min()\n",
    "print len(item_mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rating_matrix.nnz / float(len(user_mapping) * len(item_mapping))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# store bag-of-words to file\n",
    "def vocabulary_to_file(vocab):\n",
    "    f0 = open('vocabulary.txt', 'w')\n",
    "\n",
    "    for word in vocab:\n",
    "        f0.write(word + '\\n')\n",
    "    f0.close()\n",
    "\n",
    "\n",
    "def word_count_to_file(item_list, word_count):\n",
    "    f0 = open('word_counts.txt', 'w')\n",
    "    for i, document in enumerate(word_count):\n",
    "        indices = document.indices\n",
    "        counts = document.data\n",
    "        num_words = document.count_nonzero()\n",
    "\n",
    "        f0.write(str(item_list[i]) + ' ' + str(num_words))\n",
    "        for j, indice in enumerate(indices):\n",
    "            f0.write(' ' + str(indice) + ':' + str(counts[j]))\n",
    "        f0.write('\\n')\n",
    "    f0.close()\n",
    "\n",
    "vocabulary_to_file(vocab)\n",
    "word_count_to_file(item_mapping, word_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "def save_obj(obj, name ):\n",
    "    with open(name + '.pkl', 'wb') as f:\n",
    "        pickle.dump(obj, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "save_obj(user_item_dict, 'Books_user_records')\n",
    "save_obj(user_mapping, 'Books_user_mapping')\n",
    "save_obj(item_mapping, 'Books_item_mapping')\n",
    "save_obj(relation_matrix, 'item_relation')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print vocab[:10]\n",
    "print all_review[-1]\n",
    "print review_str[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_to_index = dict()\n",
    "for w_id, word in enumerate(vocab):\n",
    "    word_to_index[word] = w_id\n",
    "\n",
    "all_review_index = []\n",
    "for i in range(len(review_str)):\n",
    "    cur_review = review_str[i].split(' ')\n",
    "    cur_index = []\n",
    "    for word in cur_review:\n",
    "        if word in word_to_index:\n",
    "            cur_index.append(word_to_index[word])\n",
    "    all_review_index.append(cur_index)\n",
    "    \n",
    "print all_review_index[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# store word sequence to a file\n",
    "save_obj(all_review_index, 'review_word_sequence')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print seq_data[-1]\n",
    "user_inverse_mapping = generate_inverse_mapping(user_mapping)\n",
    "item_inverse_mapping = generate_inverse_mapping(item_mapping)\n",
    "print user_item_dict[user_mapping[-1]]\n",
    "tmp = []\n",
    "for item_id in seq_data[-1]:\n",
    "    tmp.append(item_inverse_mapping[item_id])\n",
    "print sorted(tmp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print all_review[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(word_counts.shape[0]):\n",
    "    if word_counts.getrow(i).getnnz() == 0:\n",
    "        print i"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
